import pandas as pd
import numpy as np
# Load the three source tables.
df1 = pd.read_csv("Player_data.csv", encoding="UTF-8")
df2 = pd.read_csv("Players.csv", encoding="UTF-8")
df3 = pd.read_csv("Seasons_Stats.csv", encoding="UTF-8")

# Outer-join the tables on the player's name: the join key is "name" on the
# left side and "Player" on the right side of each merge.
merged_df1 = df1.merge(df2, how="outer", left_on="name", right_on="Player")
merged_df = merged_df1.merge(df3, how="outer", left_on="name", right_on="Player")

# Columns to discard, identified by position. Per the original author's note
# these are the repeated name columns and columns considered not useful.
# Each entry is resolved against the frame as it looks AFTER the previous
# drop, so the order of the entries is significant and must not be changed.
# NOTE(review): positional drops silently pick different columns if any of
# the input CSV layouts change -- confirm against the current files.
_DROP_STEPS = (
    8,
    15,
    slice(4, 9),
    slice(10, 13),
    slice(13, 15),
    25,
    29,
    8,
    9,
)
for _step in _DROP_STEPS:
    merged_df = merged_df.drop(columns=merged_df.columns[_step])

merged_df = (
    merged_df
    # Rows without a "name" value are discarded.
    .dropna(subset=["name"])
    # Restore the plain column titles for the suffixed height/weight columns.
    .rename(columns={"height_y": "height", "weight_y": "weight"})
    # Discard rows in which every value is missing.
    .dropna(how="all")
    # One row per name: only the first occurrence of each name is kept.
    .drop_duplicates(subset=["name"], keep="first")
)

# Every column except birth state and college must be populated for a row to
# survive. NOTE(review): "collage" is presumably the spelling used in the
# input data -- confirm; a label that matches no column excludes nothing.
_OPTIONAL_COLUMNS = ("birth_state", "collage")
columns_check = [
    label for label in merged_df.columns if label not in _OPTIONAL_COLUMNS
]
df_cleaned = merged_df.dropna(subset=columns_check)

# Write the cleaned table to disk without the index column.
df_cleaned.to_csv("Player_cleaned.csv", index=False)